In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# DIMENSION REDUCTION AND SHRINKAGE
# Import necessary libraries
! pip install numpy;
! pip install pandas;
! pip install scikit-learn;
! pip install statsmodels;
! pip install mlxtend;
! pip install ISLP;
import numpy as np
import pandas as pd
import statsmodels.api as sm
In [12]:
# Variable Selection and Ridge Regression
# Part I: Variable Selection
# 1. Variable Selection Using Exhaustive Search (like regsubsets in R)
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.linear_model import LinearRegression
# Load Boston dataset
from ISLP import load_data;
boston = load_data('boston')
print(boston)
        crim    zn  indus  chas    nox     rm   age     dis  rad  tax  \
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222
..       ...   ...    ...   ...    ...    ...   ...     ...  ...  ...
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273

     ptratio  lstat  medv
0       15.3   4.98  24.0
1       17.8   9.14  21.6
2       17.8   4.03  34.7
3       18.7   2.94  33.4
4       18.7   5.33  36.2
..       ...    ...   ...
501     21.0   9.67  22.4
502     21.0   9.08  20.6
503     21.0   5.64  23.9
504     21.0   6.48  22.0
505     21.0   7.88  11.9

[506 rows x 13 columns]
In [ ]:
# Define X and y for regression
X_boston = boston[['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat']]
y_boston = boston[['medv']]
print(X_boston,y_boston)
In [ ]:
# Initialize the linear regression model
lr = LinearRegression()
# Exhaustive feature selection
efs = EFS(lr,
          min_features=1,
          max_features=12,
          scoring='r2',
          print_progress=True,
          cv=5)
efs.fit(X_boston, y_boston)
In [ ]:
# You can ignore these warnings; they only concern deprecations planned for future versions of scikit-learn's model_selection module.
In [19]:
# Best subset of features
best_features = efs.best_feature_names_
print(f"Best features: {best_features}")
# Summarize results
efs_results = pd.DataFrame(efs.get_metric_dict()).T
print(efs_results[['feature_idx', 'avg_score', 'std_dev']].head())
Best features: ('crim', 'zn', 'chas', 'nox', 'age', 'dis', 'rad', 'tax', 'ptratio', 'lstat')
  feature_idx  avg_score   std_dev
0        (0,)  -0.319539  0.423008
1        (1,)  -0.438576  0.603464
2        (2,)   -0.21937  0.154856
3        (3,)   -0.64991  0.846949
4        (4,)  -0.307827   0.34222
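In [ ]:
# Optional check (a minimal sketch, not part of the original lab): refit a
# linear model on the subset chosen by the exhaustive search and report its
# 5-fold cross-validated R^2 for comparison with the full model.
from sklearn.model_selection import cross_val_score
X_best = X_boston[list(efs.best_feature_names_)]
cv_r2 = cross_val_score(LinearRegression(), X_best, y_boston.values.ravel(),
                        scoring='r2', cv=5)
print(f"CV R^2 on the selected subset: {cv_r2.mean():.3f} (+/- {cv_r2.std():.3f})")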
In [20]:
# 2. Stepwise Regression
def forward_selection(X, y):
    # Forward stepwise selection: at each step add the candidate predictor
    # that gives the lowest AIC, and stop when no addition improves AIC.
    initial_features = []
    remaining_features = list(X.columns)
    best_features = initial_features[:]
    current_score, best_new_score = float('inf'), float('inf')
    while remaining_features:
        scores_with_candidates = []
        for candidate in remaining_features:
            formula = best_features + [candidate]
            X_new = X[formula]
            X_new = sm.add_constant(X_new)
            model = sm.OLS(y, X_new).fit()
            score = model.aic
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()  # smallest AIC first
        best_new_score, best_candidate = scores_with_candidates[0]
        if current_score > best_new_score:
            remaining_features.remove(best_candidate)
            best_features.append(best_candidate)
            current_score = best_new_score
        else:
            break
    # Refit the final model on the selected features
    formula = best_features
    X_final = X[formula]
    X_final = sm.add_constant(X_final)
    model = sm.OLS(y, X_final).fit()
    return model
model = forward_selection(X_boston, y_boston)
print(model.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                   medv   R-squared:                       0.734
Model:                            OLS   Adj. R-squared:                  0.729
Method:                 Least Squares   F-statistic:                     136.8
Date:                Fri, 27 Sep 2024   Prob (F-statistic):          1.73e-135
Time:                        13:41:06   Log-Likelihood:                -1505.0
No. Observations:                 506   AIC:                             3032.
Df Residuals:                     495   BIC:                             3078.
Df Model:                          10
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         41.4517      4.903      8.454      0.000      31.818      51.086
lstat         -0.5465      0.047    -11.519      0.000      -0.640      -0.453
rm             3.6730      0.409      8.978      0.000       2.869       4.477
ptratio       -0.9310      0.130     -7.138      0.000      -1.187      -0.675
dis           -1.5160      0.188     -8.078      0.000      -1.885      -1.147
nox          -18.2624      3.565     -5.122      0.000     -25.267     -11.258
chas           2.8719      0.863      3.329      0.001       1.177       4.567
zn             0.0462      0.014      3.378      0.001       0.019       0.073
crim          -0.1217      0.033     -3.696      0.000      -0.186      -0.057
rad            0.2839      0.064      4.440      0.000       0.158       0.410
tax           -0.0123      0.003     -3.608      0.000      -0.019      -0.006
==============================================================================
Omnibus:                      172.594   Durbin-Watson:                   1.074
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              725.971
Skew:                           1.486   Prob(JB):                    2.28e-158
Kurtosis:                       8.060   Cond. No.                     1.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
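In [ ]:
# Backward elimination (a minimal sketch, not part of the original lab): the
# mirror image of forward_selection above. Start from the full model and
# repeatedly drop the predictor whose removal lowers AIC the most, stopping
# when no removal improves AIC.
def backward_elimination(X, y):
    features = list(X.columns)
    current_score = sm.OLS(y, sm.add_constant(X[features])).fit().aic
    while len(features) > 1:
        scores_with_candidates = []
        for candidate in features:
            reduced = [f for f in features if f != candidate]
            model = sm.OLS(y, sm.add_constant(X[reduced])).fit()
            scores_with_candidates.append((model.aic, candidate))
        scores_with_candidates.sort()  # smallest AIC first
        best_new_score, dropped_feature = scores_with_candidates[0]
        if best_new_score < current_score:
            features.remove(dropped_feature)
            current_score = best_new_score
        else:
            break
    return sm.OLS(y, sm.add_constant(X[features])).fit()

backward_model = backward_elimination(X_boston, y_boston)
print(backward_model.summary())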
In [22]:
# Part II: Ridge Regression
# To perform Ridge Regression, you can use the Ridge class from sklearn.linear_model.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_boston, y_boston, test_size=0.2, random_state=42)
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
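In [ ]:
# Why standardize? The ridge penalty shrinks every coefficient by the same rule,
# so predictors should be on a common scale. A quick sanity check (not part of
# the original lab): each scaled training column has mean ~0 and std ~1.
print(X_train_scaled.mean(axis=0).round(2))
print(X_train_scaled.std(axis=0).round(2))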
In [23]:
# Ridge regression over a grid of lambdas (alpha = 0 corresponds to ordinary least squares)
alphas = np.linspace(0, 10, 100)
ridge_models = [Ridge(alpha=alpha).fit(X_train_scaled, y_train) for alpha in alphas]
In [28]:
# Plot the coefficients against different alphas
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
for i in range(X_boston.shape[1]):
    plt.plot(alphas, [ridge.coef_[0][i] for ridge in ridge_models],
             label=X_boston.columns[i])  # coef_ is 2-D because y_boston is a one-column DataFrame
plt.xlabel('Lambda')
plt.ylabel('Coefficient values')
plt.title('Ridge Regression Coefficients vs Lambda')
plt.legend()
plt.show()
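In [ ]:
# Optional sketch (not part of the original lab): mean_squared_error was imported
# above but not yet used; here it tracks test-set error over the same grid of
# lambdas, showing how the penalty trades a little bias for lower variance.
test_mse = [mean_squared_error(y_test, m.predict(X_test_scaled)) for m in ridge_models]
plt.figure(figsize=(8, 5))
plt.plot(alphas, test_mse)
plt.xlabel('Lambda')
plt.ylabel('Test MSE')
plt.title('Ridge Regression Test MSE vs Lambda')
plt.show()
print(f"Smallest test MSE: {min(test_mse):.3f} at lambda = {alphas[int(np.argmin(test_mse))]:.2f}")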
In [30]:
# Choose the best lambda using cross-validation
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=np.linspace(0.01, 10, 100), store_cv_values=True)
ridge_cv.fit(X_train_scaled, y_train)
print(f"Best lambda: {ridge_cv.alpha_}")
Best lambda: 7.073636363636363
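In [ ]:
# Final check (a minimal sketch, not part of the original lab): refit Ridge at the
# cross-validated lambda and compare its test MSE with ordinary least squares on
# the same train/test split.
ridge_best = Ridge(alpha=ridge_cv.alpha_).fit(X_train_scaled, y_train)
ols = LinearRegression().fit(X_train_scaled, y_train)
print(f"Ridge test MSE: {mean_squared_error(y_test, ridge_best.predict(X_test_scaled)):.3f}")
print(f"OLS test MSE:   {mean_squared_error(y_test, ols.predict(X_test_scaled)):.3f}")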